Binary Classification of Breast Cancer & Visualization for Doctor

Data Importing & Cleaning & Inspecting

Import dataset

# Load the Wisconsin breast cancer dataset from CSV.
# NOTE(review): hard-coded absolute Windows path — consider a relative path
# (or file.choose()) so the script runs on other machines.
data <-  read.csv('C:/Users/patel/Desktop/Main folder/Brest cancer/data.csv')
# Preview the first rows (output below shows 32 columns incl. a trailing all-NA `X`).
head(data)
##         id diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1   842302         M       17.99        10.38         122.80    1001.0
## 2   842517         M       20.57        17.77         132.90    1326.0
## 3 84300903         M       19.69        21.25         130.00    1203.0
## 4 84348301         M       11.42        20.38          77.58     386.1
## 5 84358402         M       20.29        14.34         135.10    1297.0
## 6   843786         M       12.45        15.70          82.57     477.1
##   smoothness_mean compactness_mean concavity_mean concave.points_mean
## 1         0.11840          0.27760         0.3001             0.14710
## 2         0.08474          0.07864         0.0869             0.07017
## 3         0.10960          0.15990         0.1974             0.12790
## 4         0.14250          0.28390         0.2414             0.10520
## 5         0.10030          0.13280         0.1980             0.10430
## 6         0.12780          0.17000         0.1578             0.08089
##   symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1        0.2419                0.07871    1.0950     0.9053        8.589
## 2        0.1812                0.05667    0.5435     0.7339        3.398
## 3        0.2069                0.05999    0.7456     0.7869        4.585
## 4        0.2597                0.09744    0.4956     1.1560        3.445
## 5        0.1809                0.05883    0.7572     0.7813        5.438
## 6        0.2087                0.07613    0.3345     0.8902        2.217
##   area_se smoothness_se compactness_se concavity_se concave.points_se
## 1  153.40      0.006399        0.04904      0.05373           0.01587
## 2   74.08      0.005225        0.01308      0.01860           0.01340
## 3   94.03      0.006150        0.04006      0.03832           0.02058
## 4   27.23      0.009110        0.07458      0.05661           0.01867
## 5   94.44      0.011490        0.02461      0.05688           0.01885
## 6   27.19      0.007510        0.03345      0.03672           0.01137
##   symmetry_se fractal_dimension_se radius_worst texture_worst
## 1     0.03003             0.006193        25.38         17.33
## 2     0.01389             0.003532        24.99         23.41
## 3     0.02250             0.004571        23.57         25.53
## 4     0.05963             0.009208        14.91         26.50
## 5     0.01756             0.005115        22.54         16.67
## 6     0.02165             0.005082        15.47         23.75
##   perimeter_worst area_worst smoothness_worst compactness_worst
## 1          184.60     2019.0           0.1622            0.6656
## 2          158.80     1956.0           0.1238            0.1866
## 3          152.50     1709.0           0.1444            0.4245
## 4           98.87      567.7           0.2098            0.8663
## 5          152.20     1575.0           0.1374            0.2050
## 6          103.40      741.6           0.1791            0.5249
##   concavity_worst concave.points_worst symmetry_worst
## 1          0.7119               0.2654         0.4601
## 2          0.2416               0.1860         0.2750
## 3          0.4504               0.2430         0.3613
## 4          0.6869               0.2575         0.6638
## 5          0.4000               0.1625         0.2364
## 6          0.5355               0.1741         0.3985
##   fractal_dimension_worst  X
## 1                 0.11890 NA
## 2                 0.08902 NA
## 3                 0.08758 NA
## 4                 0.17300 NA
## 5                 0.07678 NA
## 6                 0.12440 NA

Remove the empty `X` column (all values are NA, an artifact of trailing commas in the CSV)

data$X <-  NULL

Reshape the datasets

# Drop the first column (`id`) — it carries no predictive signal.
data <- data[, -1]
# Recode the target as a factor: "B" -> "Benign", anything else ("M") -> "Malignant".
data$diagnosis <- factor(ifelse(data$diagnosis == "B", "Benign", "Malignant"))

Inspect the datasets

summary(data)
##      diagnosis    radius_mean      texture_mean   perimeter_mean  
##  Benign   :357   Min.   : 6.981   Min.   : 9.71   Min.   : 43.79  
##  Malignant:212   1st Qu.:11.700   1st Qu.:16.17   1st Qu.: 75.17  
##                  Median :13.370   Median :18.84   Median : 86.24  
##                  Mean   :14.127   Mean   :19.29   Mean   : 91.97  
##                  3rd Qu.:15.780   3rd Qu.:21.80   3rd Qu.:104.10  
##                  Max.   :28.110   Max.   :39.28   Max.   :188.50  
##    area_mean      smoothness_mean   compactness_mean  concavity_mean   
##  Min.   : 143.5   Min.   :0.05263   Min.   :0.01938   Min.   :0.00000  
##  1st Qu.: 420.3   1st Qu.:0.08637   1st Qu.:0.06492   1st Qu.:0.02956  
##  Median : 551.1   Median :0.09587   Median :0.09263   Median :0.06154  
##  Mean   : 654.9   Mean   :0.09636   Mean   :0.10434   Mean   :0.08880  
##  3rd Qu.: 782.7   3rd Qu.:0.10530   3rd Qu.:0.13040   3rd Qu.:0.13070  
##  Max.   :2501.0   Max.   :0.16340   Max.   :0.34540   Max.   :0.42680  
##  concave.points_mean symmetry_mean    fractal_dimension_mean
##  Min.   :0.00000     Min.   :0.1060   Min.   :0.04996       
##  1st Qu.:0.02031     1st Qu.:0.1619   1st Qu.:0.05770       
##  Median :0.03350     Median :0.1792   Median :0.06154       
##  Mean   :0.04892     Mean   :0.1812   Mean   :0.06280       
##  3rd Qu.:0.07400     3rd Qu.:0.1957   3rd Qu.:0.06612       
##  Max.   :0.20120     Max.   :0.3040   Max.   :0.09744       
##    radius_se        texture_se      perimeter_se       area_se       
##  Min.   :0.1115   Min.   :0.3602   Min.   : 0.757   Min.   :  6.802  
##  1st Qu.:0.2324   1st Qu.:0.8339   1st Qu.: 1.606   1st Qu.: 17.850  
##  Median :0.3242   Median :1.1080   Median : 2.287   Median : 24.530  
##  Mean   :0.4052   Mean   :1.2169   Mean   : 2.866   Mean   : 40.337  
##  3rd Qu.:0.4789   3rd Qu.:1.4740   3rd Qu.: 3.357   3rd Qu.: 45.190  
##  Max.   :2.8730   Max.   :4.8850   Max.   :21.980   Max.   :542.200  
##  smoothness_se      compactness_se      concavity_se    
##  Min.   :0.001713   Min.   :0.002252   Min.   :0.00000  
##  1st Qu.:0.005169   1st Qu.:0.013080   1st Qu.:0.01509  
##  Median :0.006380   Median :0.020450   Median :0.02589  
##  Mean   :0.007041   Mean   :0.025478   Mean   :0.03189  
##  3rd Qu.:0.008146   3rd Qu.:0.032450   3rd Qu.:0.04205  
##  Max.   :0.031130   Max.   :0.135400   Max.   :0.39600  
##  concave.points_se   symmetry_se       fractal_dimension_se
##  Min.   :0.000000   Min.   :0.007882   Min.   :0.0008948   
##  1st Qu.:0.007638   1st Qu.:0.015160   1st Qu.:0.0022480   
##  Median :0.010930   Median :0.018730   Median :0.0031870   
##  Mean   :0.011796   Mean   :0.020542   Mean   :0.0037949   
##  3rd Qu.:0.014710   3rd Qu.:0.023480   3rd Qu.:0.0045580   
##  Max.   :0.052790   Max.   :0.078950   Max.   :0.0298400   
##   radius_worst   texture_worst   perimeter_worst    area_worst    
##  Min.   : 7.93   Min.   :12.02   Min.   : 50.41   Min.   : 185.2  
##  1st Qu.:13.01   1st Qu.:21.08   1st Qu.: 84.11   1st Qu.: 515.3  
##  Median :14.97   Median :25.41   Median : 97.66   Median : 686.5  
##  Mean   :16.27   Mean   :25.68   Mean   :107.26   Mean   : 880.6  
##  3rd Qu.:18.79   3rd Qu.:29.72   3rd Qu.:125.40   3rd Qu.:1084.0  
##  Max.   :36.04   Max.   :49.54   Max.   :251.20   Max.   :4254.0  
##  smoothness_worst  compactness_worst concavity_worst  concave.points_worst
##  Min.   :0.07117   Min.   :0.02729   Min.   :0.0000   Min.   :0.00000     
##  1st Qu.:0.11660   1st Qu.:0.14720   1st Qu.:0.1145   1st Qu.:0.06493     
##  Median :0.13130   Median :0.21190   Median :0.2267   Median :0.09993     
##  Mean   :0.13237   Mean   :0.25427   Mean   :0.2722   Mean   :0.11461     
##  3rd Qu.:0.14600   3rd Qu.:0.33910   3rd Qu.:0.3829   3rd Qu.:0.16140     
##  Max.   :0.22260   Max.   :1.05800   Max.   :1.2520   Max.   :0.29100     
##  symmetry_worst   fractal_dimension_worst
##  Min.   :0.1565   Min.   :0.05504        
##  1st Qu.:0.2504   1st Qu.:0.07146        
##  Median :0.2822   Median :0.08004        
##  Mean   :0.2901   Mean   :0.08395        
##  3rd Qu.:0.3179   3rd Qu.:0.09208        
##  Max.   :0.6638   Max.   :0.20750
str(data) 
## 'data.frame':    569 obs. of  31 variables:
##  $ diagnosis              : Factor w/ 2 levels "Benign","Malignant": 2 2 2 2 2 2 2 2 2 2 ...
##  $ radius_mean            : num  18 20.6 19.7 11.4 20.3 ...
##  $ texture_mean           : num  10.4 17.8 21.2 20.4 14.3 ...
##  $ perimeter_mean         : num  122.8 132.9 130 77.6 135.1 ...
##  $ area_mean              : num  1001 1326 1203 386 1297 ...
##  $ smoothness_mean        : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ compactness_mean       : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ concavity_mean         : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ concave.points_mean    : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ symmetry_mean          : num  0.242 0.181 0.207 0.26 0.181 ...
##  $ fractal_dimension_mean : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius_se              : num  1.095 0.543 0.746 0.496 0.757 ...
##  $ texture_se             : num  0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter_se           : num  8.59 3.4 4.58 3.44 5.44 ...
##  $ area_se                : num  153.4 74.1 94 27.2 94.4 ...
##  $ smoothness_se          : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness_se         : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity_se           : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave.points_se      : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry_se            : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal_dimension_se   : num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ radius_worst           : num  25.4 25 23.6 14.9 22.5 ...
##  $ texture_worst          : num  17.3 23.4 25.5 26.5 16.7 ...
##  $ perimeter_worst        : num  184.6 158.8 152.5 98.9 152.2 ...
##  $ area_worst             : num  2019 1956 1709 568 1575 ...
##  $ smoothness_worst       : num  0.162 0.124 0.144 0.21 0.137 ...
##  $ compactness_worst      : num  0.666 0.187 0.424 0.866 0.205 ...
##  $ concavity_worst        : num  0.712 0.242 0.45 0.687 0.4 ...
##  $ concave.points_worst   : num  0.265 0.186 0.243 0.258 0.163 ...
##  $ symmetry_worst         : num  0.46 0.275 0.361 0.664 0.236 ...
##  $ fractal_dimension_worst: num  0.1189 0.089 0.0876 0.173 0.0768 ...
head(data)
##   diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1 Malignant       17.99        10.38         122.80    1001.0
## 2 Malignant       20.57        17.77         132.90    1326.0
## 3 Malignant       19.69        21.25         130.00    1203.0
## 4 Malignant       11.42        20.38          77.58     386.1
## 5 Malignant       20.29        14.34         135.10    1297.0
## 6 Malignant       12.45        15.70          82.57     477.1
##   smoothness_mean compactness_mean concavity_mean concave.points_mean
## 1         0.11840          0.27760         0.3001             0.14710
## 2         0.08474          0.07864         0.0869             0.07017
## 3         0.10960          0.15990         0.1974             0.12790
## 4         0.14250          0.28390         0.2414             0.10520
## 5         0.10030          0.13280         0.1980             0.10430
## 6         0.12780          0.17000         0.1578             0.08089
##   symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1        0.2419                0.07871    1.0950     0.9053        8.589
## 2        0.1812                0.05667    0.5435     0.7339        3.398
## 3        0.2069                0.05999    0.7456     0.7869        4.585
## 4        0.2597                0.09744    0.4956     1.1560        3.445
## 5        0.1809                0.05883    0.7572     0.7813        5.438
## 6        0.2087                0.07613    0.3345     0.8902        2.217
##   area_se smoothness_se compactness_se concavity_se concave.points_se
## 1  153.40      0.006399        0.04904      0.05373           0.01587
## 2   74.08      0.005225        0.01308      0.01860           0.01340
## 3   94.03      0.006150        0.04006      0.03832           0.02058
## 4   27.23      0.009110        0.07458      0.05661           0.01867
## 5   94.44      0.011490        0.02461      0.05688           0.01885
## 6   27.19      0.007510        0.03345      0.03672           0.01137
##   symmetry_se fractal_dimension_se radius_worst texture_worst
## 1     0.03003             0.006193        25.38         17.33
## 2     0.01389             0.003532        24.99         23.41
## 3     0.02250             0.004571        23.57         25.53
## 4     0.05963             0.009208        14.91         26.50
## 5     0.01756             0.005115        22.54         16.67
## 6     0.02165             0.005082        15.47         23.75
##   perimeter_worst area_worst smoothness_worst compactness_worst
## 1          184.60     2019.0           0.1622            0.6656
## 2          158.80     1956.0           0.1238            0.1866
## 3          152.50     1709.0           0.1444            0.4245
## 4           98.87      567.7           0.2098            0.8663
## 5          152.20     1575.0           0.1374            0.2050
## 6          103.40      741.6           0.1791            0.5249
##   concavity_worst concave.points_worst symmetry_worst
## 1          0.7119               0.2654         0.4601
## 2          0.2416               0.1860         0.2750
## 3          0.4504               0.2430         0.3613
## 4          0.6869               0.2575         0.6638
## 5          0.4000               0.1625         0.2364
## 6          0.5355               0.1741         0.3985
##   fractal_dimension_worst
## 1                 0.11890
## 2                 0.08902
## 3                 0.08758
## 4                 0.17300
## 5                 0.07678
## 6                 0.12440

Analyze the Correlation between variables

Correlation between each variables

mean

library(PerformanceAnalytics)
## Warning: package 'PerformanceAnalytics' was built under R version 3.4.4
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
chart.Correlation(data[,c(2:11)],histogram = T, col="blue",main ="Cancer Mean")

se

chart.Correlation(data[,c(12:21)], method="pearson",hist.col = "#1fbbfa",main="Cancer SE")

worst

chart.Correlation(data[,c(22:31)], method="pearson",hist.col = "#1fbbfa",main="Cancer worst")

See the relation between each variables (diagnosis included)

mean

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
library(GGally)
## Warning: package 'GGally' was built under R version 3.4.4
# Pairwise scatter/density matrix of the *_mean features, colored by diagnosis;
# lower panels add a smoothed fit.
ggpairs(data[, c(2:11, 1)],
        aes(color = diagnosis, alpha = 0.75),
        lower = list(continuous = "smooth")) +
  theme_bw() +
  labs(title = "Cancer Mean") +
  theme(plot.title = element_text(face = 'bold', color = 'black', hjust = 0.5, size = 12))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

se

# Pairwise scatter/density matrix of the *_se features, colored by diagnosis.
ggpairs(data[, c(12:21, 1)],
        aes(color = diagnosis, alpha = 0.75),
        lower = list(continuous = "smooth")) +
  theme_bw() +
  labs(title = "Cancer SE") +
  theme(plot.title = element_text(face = 'bold', color = 'black', hjust = 0.5, size = 12))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

worst

# Pairwise scatter/density matrix of the *_worst features, colored by diagnosis.
ggpairs(data[, c(22:31, 1)],
        aes(color = diagnosis, alpha = 0.75),
        lower = list(continuous = "smooth")) +
  theme_bw() +
  labs(title = "Cancer Worst") +
  theme(plot.title = element_text(face = 'bold', color = 'black', hjust = 0.5, size = 12))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Principal Component Analysis (PCA)

PCA is run on standardized data so that differences in variable scale do not distort the components.

library(factoextra)
## Warning: package 'factoextra' was built under R version 3.4.4
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
data_pca <- transform(data)

Summary

all

# PCA on all 30 numeric features, standardized via scale. = TRUE.
# Removed the invalid `cor` argument: `cor` belongs to princomp(), and
# prcomp() disregarded it with a warning (now gone).
all_pca <- prcomp(data_pca[, -1], scale. = TRUE)
summary(all_pca)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6
## Standard deviation     3.6444 2.3857 1.67867 1.40735 1.28403 1.09880
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025
## Cumulative Proportion  0.4427 0.6324 0.72636 0.79239 0.84734 0.88759
##                            PC7     PC8    PC9    PC10   PC11    PC12
## Standard deviation     0.82172 0.69037 0.6457 0.59219 0.5421 0.51104
## Proportion of Variance 0.02251 0.01589 0.0139 0.01169 0.0098 0.00871
## Cumulative Proportion  0.91010 0.92598 0.9399 0.95157 0.9614 0.97007
##                           PC13    PC14    PC15    PC16    PC17    PC18
## Standard deviation     0.49128 0.39624 0.30681 0.28260 0.24372 0.22939
## Proportion of Variance 0.00805 0.00523 0.00314 0.00266 0.00198 0.00175
## Cumulative Proportion  0.97812 0.98335 0.98649 0.98915 0.99113 0.99288
##                           PC19    PC20   PC21    PC22    PC23   PC24
## Standard deviation     0.22244 0.17652 0.1731 0.16565 0.15602 0.1344
## Proportion of Variance 0.00165 0.00104 0.0010 0.00091 0.00081 0.0006
## Cumulative Proportion  0.99453 0.99557 0.9966 0.99749 0.99830 0.9989
##                           PC25    PC26    PC27    PC28    PC29    PC30
## Standard deviation     0.12442 0.09043 0.08307 0.03987 0.02736 0.01153
## Proportion of Variance 0.00052 0.00027 0.00023 0.00005 0.00002 0.00000
## Cumulative Proportion  0.99942 0.99969 0.99992 0.99997 1.00000 1.00000

mean

# PCA on the *_mean features (columns 2-11), standardized via scale. = TRUE.
# Removed the invalid `cor` argument (a princomp() argument that prcomp()
# was disregarding with a warning).
mean_pca <- prcomp(data_pca[, 2:11], scale. = TRUE)
summary(mean_pca)
## Importance of components:
##                           PC1    PC2     PC3    PC4     PC5     PC6
## Standard deviation     2.3406 1.5870 0.93841 0.7064 0.61036 0.35234
## Proportion of Variance 0.5479 0.2519 0.08806 0.0499 0.03725 0.01241
## Cumulative Proportion  0.5479 0.7997 0.88779 0.9377 0.97495 0.98736
##                            PC7     PC8     PC9    PC10
## Standard deviation     0.28299 0.18679 0.10552 0.01680
## Proportion of Variance 0.00801 0.00349 0.00111 0.00003
## Cumulative Proportion  0.99537 0.99886 0.99997 1.00000

se

# PCA on the *_se features (columns 12-21), standardized via scale. = TRUE.
# Removed the invalid `cor` argument (a princomp() argument that prcomp()
# was disregarding with a warning).
se_pca <- prcomp(data_pca[, 12:21], scale. = TRUE)
summary(se_pca)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6
## Standard deviation     2.1779 1.4406 1.1245 0.77095 0.75991 0.57939
## Proportion of Variance 0.4743 0.2075 0.1264 0.05944 0.05775 0.03357
## Cumulative Proportion  0.4743 0.6819 0.8083 0.86774 0.92548 0.95905
##                            PC7    PC8     PC9    PC10
## Standard deviation     0.43512 0.3962 0.20436 0.14635
## Proportion of Variance 0.01893 0.0157 0.00418 0.00214
## Cumulative Proportion  0.97798 0.9937 0.99786 1.00000

worst

# PCA on the *_worst features (columns 22-31), standardized via scale. = TRUE.
# Removed the invalid `cor` argument (a princomp() argument that prcomp()
# was disregarding with a warning).
worst_pca <- prcomp(data_pca[, 22:31], scale. = TRUE)
summary(worst_pca)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6
## Standard deviation     2.3869 1.4443 0.89597 0.73531 0.71741 0.42862
## Proportion of Variance 0.5697 0.2086 0.08028 0.05407 0.05147 0.01837
## Cumulative Proportion  0.5697 0.7783 0.85860 0.91267 0.96413 0.98251
##                            PC7     PC8     PC9    PC10
## Standard deviation     0.28959 0.26802 0.12343 0.06326
## Proportion of Variance 0.00839 0.00718 0.00152 0.00040
## Cumulative Proportion  0.99089 0.99808 0.99960 1.00000

Screeplot

all

# Scree plot for the all-feature PCA: % variance per component, first 10 PCs.
# Fixed T -> TRUE and the "Principle" -> "Principal" typo in the axis label.
fviz_eig(all_pca, addlabels = TRUE, ylim = c(0, 60), geom = c("bar", "line"),
         barfill = "red", barcolor = "grey", linecolor = "black", ncp = 10) +
  labs(title = "Cancer All Variances - PCA", x = "Principal Components", y = "% of variance")

mean

# Scree plot for the *_mean PCA.
# Fixed the copy-pasted title (it said "Cancer All variance"), T -> TRUE,
# and the "Principle" -> "Principal" typo.
fviz_eig(mean_pca, addlabels = TRUE, ylim = c(0, 60), geom = c("bar", "line"),
         barfill = "pink", barcolor = "grey", linecolor = "black", ncp = 10) +
  labs(title = "Cancer Mean Variance - PCA", x = "Principal Components", y = "% of variance")

se

# Scree plot for the *_se PCA.
# Fixed the copy-pasted title (it said "Cancer All variance"), T -> TRUE,
# and the "Principle" -> "Principal" typo.
fviz_eig(se_pca, addlabels = TRUE, ylim = c(0, 60), geom = c("bar", "line"),
         barfill = "pink", barcolor = "grey", linecolor = "black", ncp = 10) +
  labs(title = "Cancer SE Variance - PCA", x = "Principal Components", y = "% of variance")

worst

# Scree plot for the *_worst PCA.
# Fixed the copy-pasted title (it said "Cancer All variance"), T -> TRUE,
# and the "Principle" -> "Principal" typo.
fviz_eig(worst_pca, addlabels = TRUE, ylim = c(0, 60), geom = c("bar", "line"),
         barfill = "pink", barcolor = "grey", linecolor = "black", ncp = 10) +
  labs(title = "Cancer Worst Variance - PCA", x = "Principal Components", y = "% of variance")

Get PCA Variables

all

# Extract per-variable PCA results: coordinates, correlations, cos2, contributions.
all_var <-  get_pca_var(all_pca)
all_var
## Principal Component Analysis Results for variables
##  ===================================================
##   Name       Description                                    
## 1 "$coord"   "Coordinates for the variables"                
## 2 "$cor"     "Correlations between variables and dimensions"
## 3 "$cos2"    "Cos2 for the variables"                       
## 4 "$contrib" "contributions of the variables"

Quality of representation of PCA

Correlation between variables and PCA

library("corrplot")
## corrplot 0.84 loaded
corrplot(all_var$cos2, is.corr=FALSE)

Contributions of variables to PCA

To highlight the variables contributing most to each component

corrplot(all_var$contrib, is.corr=FALSE)  

Contributions of variables to PC1 & PC2

library(gridExtra)
# Top-10 variable contributions to PC1 (left) and PC2 (right), side by side.
p1 <- fviz_contrib(all_pca, choice = "var", axes = 1,
                   fill = "pink", color = "grey", top = 10)
p2 <- fviz_contrib(all_pca, choice = "var", axes = 2,
                   fill = "skyblue", color = "grey", top = 10)
grid.arrange(p1, p2, ncol = 2)

mean

# Per-variable PCA results for the *_mean components.
mean_var <- get_pca_var(mean_pca)
mean_var
## Principal Component Analysis Results for variables
##  ===================================================
##   Name       Description                                    
## 1 "$coord"   "Coordinates for the variables"                
## 2 "$cor"     "Correlations between variables and dimensions"
## 3 "$cos2"    "Cos2 for the variables"                       
## 4 "$contrib" "contributions of the variables"

Quality of representation of PCA

Correlation between variables and PCA

corrplot(mean_var$cos2, is.corr=FALSE)

Contributions of variables to PCA — highlighting the variables contributing most to each component

corrplot(mean_var$contrib, is.corr=FALSE)  

Contributions of variables to PC1 & PC2

# Top-10 *_mean variable contributions to PC1 (left) and PC2 (right).
p1 <- fviz_contrib(mean_pca, choice = "var", axes = 1,
                   fill = "pink", color = "grey", top = 10)
p2 <- fviz_contrib(mean_pca, choice = "var", axes = 2,
                   fill = "skyblue", color = "grey", top = 10)
grid.arrange(p1, p2, ncol = 2)

se

# Per-variable PCA results for the *_se components.
se_var <- get_pca_var(se_pca)
se_var
## Principal Component Analysis Results for variables
##  ===================================================
##   Name       Description                                    
## 1 "$coord"   "Coordinates for the variables"                
## 2 "$cor"     "Correlations between variables and dimensions"
## 3 "$cos2"    "Cos2 for the variables"                       
## 4 "$contrib" "contributions of the variables"

Quality of representation of PCA

Correlation between variables and PCA

corrplot(se_var$cos2, is.corr=FALSE)

Contributions of variables to PCA

To highlight the variables contributing most to each component

corrplot(se_var$contrib, is.corr=FALSE)

Contributions of variables to PC1 & PC2

# Top-10 *_se variable contributions to PC1 (left) and PC2 (right).
p1 <- fviz_contrib(se_pca, choice = "var", axes = 1,
                   fill = "pink", color = "grey", top = 10)
p2 <- fviz_contrib(se_pca, choice = "var", axes = 2,
                   fill = "skyblue", color = "grey", top = 10)
grid.arrange(p1, p2, ncol = 2)

worst

# Per-variable PCA results for the *_worst components.
worst_var <- get_pca_var(worst_pca)
worst_var
## Principal Component Analysis Results for variables
##  ===================================================
##   Name       Description                                    
## 1 "$coord"   "Coordinates for the variables"                
## 2 "$cor"     "Correlations between variables and dimensions"
## 3 "$cos2"    "Cos2 for the variables"                       
## 4 "$contrib" "contributions of the variables"

Quality of representation of PCA

Correlation between variables and PCA

corrplot(worst_var$cos2, is.corr=FALSE)

Contributions of variables to PCA

To highlight the variables contributing most to each component

corrplot(worst_var$contrib, is.corr=FALSE)

Contributions of variables to PC1 & PC2

# Top-10 *_worst variable contributions to PC1 (left) and PC2 (right).
p1 <- fviz_contrib(worst_pca, choice = "var", axes = 1,
                   fill = "pink", color = "grey", top = 10)
p2 <- fviz_contrib(worst_pca, choice = "var", axes = 2,
                   fill = "skyblue", color = "grey", top = 10)
grid.arrange(p1, p2, ncol = 2)

See the plot - color variables by groups

The `centers` argument is the number of variable clusters, chosen from the contribution/scree analysis above.

all

# Cluster variable coordinates (k = 6) so the PCA variable plot is colored by group;
# the seed makes the k-means assignment reproducible.
set.seed(100)
res.all <- kmeans(all_var$coord, centers = 6, nstart = 25)
grp <- as.factor(res.all$cluster)

fviz_pca_var(all_pca, col.var = grp, palette = "jco", legend.title = "Cluster")

mean

# Reseed so this chunk is reproducible when run on its own (the original relied
# on whatever RNG state the previous chunk left behind).
set.seed(100)
# Cluster the *_mean variable coordinates (k = 3) and color the PCA variable plot.
res.mean <- kmeans(mean_var$coord, centers = 3, nstart = 25)
grp <- as.factor(res.mean$cluster)

fviz_pca_var(mean_pca, col.var = grp, palette = "jco", legend.title = "Cluster")

se

# Reseed so this chunk is reproducible when run on its own (the original relied
# on whatever RNG state the previous chunk left behind).
set.seed(100)
# Cluster the *_se variable coordinates (k = 4) and color the PCA variable plot.
res.se <- kmeans(se_var$coord, centers = 4, nstart = 25)
grp <- as.factor(res.se$cluster)

fviz_pca_var(se_pca, col.var = grp, palette = "jco", legend.title = "Cluster")

worst

# Reseed so this chunk is reproducible when run on its own (the original relied
# on whatever RNG state the previous chunk left behind).
set.seed(100)
# Cluster the *_worst variable coordinates (k = 3) and color the PCA variable plot.
res.worst <- kmeans(worst_var$coord, centers = 3, nstart = 25)
grp <- as.factor(res.worst$cluster)

fviz_pca_var(worst_pca, col.var = grp, palette = "jco", legend.title = "Cluster")

See the Biplot

all

# Biplot (PC1 vs PC2) of the all-feature PCA: points are patients colored by
# diagnosis, with concentration ellipses per class.
fviz_pca_biplot(all_pca,
                col.ind = data$diagnosis,
                col = "black",
                palette = "jco",
                geom = "point",
                repel = TRUE,
                addEllipses = TRUE,
                legend.title = "Diagnosis")

mean

# Biplot of the *_mean PCA, patients colored by diagnosis with class ellipses.
fviz_pca_biplot(mean_pca,
                col.ind = data$diagnosis,
                col = "black",
                palette = "jco",
                geom = "point",
                repel = TRUE,
                addEllipses = TRUE,
                legend.title = "Diagnosis")

se

# Biplot of the *_se PCA, patients colored by diagnosis with class ellipses.
fviz_pca_biplot(se_pca,
                col.ind = data$diagnosis,
                col = "black",
                palette = "jco",
                geom = "point",
                repel = TRUE,
                addEllipses = TRUE,
                legend.title = "Diagnosis")

worst

# Biplot of the *_worst PCA, patients colored by diagnosis with class ellipses.
fviz_pca_biplot(worst_pca,
                col.ind = data$diagnosis,
                col = "black",
                palette = "jco",
                geom = "point",
                repel = TRUE,
                addEllipses = TRUE,
                legend.title = "Diagnosis")

Apply several ML methods, compare them, and choose the best fit

Make test & train dataset for testing classification ML methods

Shuffle the data and split it into a train set (70%) and a test set (30%)

# Reproducible 70/30 train/test split.
nrows <- nrow(data)
set.seed(100)
# floor() makes the integer train size explicit (sample() would otherwise
# truncate the non-integer 0.7 * nrows silently).
index <- sample(seq_len(nrows), size = floor(0.7 * nrows))
train <- data[index, ]    # 398 rows (70%) for training (original comment mislabeled this as test data)
test  <- data[-index, ]   # 171 rows (30%) held out for testing

Sanity-check the split by fitting a baseline rpart model (Benign / Malignant confusion matrix below)

library(rpart)
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
# NOTE(review): this chunk is repeated verbatim in the "rpart" section below —
# one of the two copies could be removed. A direct class-proportion check would
# be prop.table(table(train$diagnosis)).
# Fit a decision tree (minsplit = 2 allows splits down to 2 cases) and evaluate
# its predictions on the held-out test set.
learn_rp <- rpart(diagnosis~.,data = train, control = rpart.control(minsplit = 2))
pre_ro <- predict(learn_rp,test[,-1],type = "class")
cm_rp <- confusionMatrix(pre_ro, test$diagnosis)
cm_rp
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign        98         7
##   Malignant      7        59
##                                           
##                Accuracy : 0.9181          
##                  95% CI : (0.8664, 0.9545)
##     No Information Rate : 0.614           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8273          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9333          
##             Specificity : 0.8939          
##          Pos Pred Value : 0.9333          
##          Neg Pred Value : 0.8939          
##              Prevalence : 0.6140          
##          Detection Rate : 0.5731          
##    Detection Prevalence : 0.6140          
##       Balanced Accuracy : 0.9136          
##                                           
##        'Positive' Class : Benign          
## 

Apply every ML methods to data

rpart

library(rpart)
library(caret)
# Decision tree baseline: minsplit = 2 lets nodes split down to 2 observations,
# producing a deep (possibly overfit) tree that is pruned in the next section.
learn_rp <- rpart(diagnosis ~ ., data = train,
                  control = rpart.control(minsplit = 2))
# Predict classes for the test set (column 1 = the diagnosis label is excluded).
pre_ro <- predict(learn_rp, newdata = test[, -1], type = "class")
cm_rp <- confusionMatrix(pre_ro, test$diagnosis)
cm_rp
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign        98         7
##   Malignant      7        59
##                                           
##                Accuracy : 0.9181          
##                  95% CI : (0.8664, 0.9545)
##     No Information Rate : 0.614           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8273          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9333          
##             Specificity : 0.8939          
##          Pos Pred Value : 0.9333          
##          Neg Pred Value : 0.8939          
##              Prevalence : 0.6140          
##          Detection Rate : 0.5731          
##    Detection Prevalence : 0.6140          
##       Balanced Accuracy : 0.9136          
##                                           
##        'Positive' Class : Benign          
## 

prune

# Prune the full tree at the complexity parameter with the lowest
# cross-validated error (xerror) in the CP table.
best_cp <- learn_rp$cptable[which.min(learn_rp$cptable[, "xerror"]), "CP"]
learn_pru <- prune(learn_rp, cp = best_cp)
# Evaluate the pruned tree on the held-out test set.
pre_pru <- predict(learn_pru, newdata = test[, -1], type = "class")
cm_pru <- confusionMatrix(pre_pru, test$diagnosis)
cm_pru
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign        91         6
##   Malignant     14        60
##                                           
##                Accuracy : 0.883           
##                  95% CI : (0.8252, 0.9271)
##     No Information Rate : 0.614           
##     P-Value [Acc > NIR] : 4.171e-15       
##                                           
##                   Kappa : 0.7587          
##  Mcnemar's Test P-Value : 0.1175          
##                                           
##             Sensitivity : 0.8667          
##             Specificity : 0.9091          
##          Pos Pred Value : 0.9381          
##          Neg Pred Value : 0.8108          
##              Prevalence : 0.6140          
##          Detection Rate : 0.5322          
##    Detection Prevalence : 0.5673          
##       Balanced Accuracy : 0.8879          
##                                           
##        'Positive' Class : Benign          
## 

OneR

library("RWeka")
## Warning: package 'RWeka' was built under R version 3.4.4
learn_1r <- OneR(diagnosis~., data=train)
pre_1r <- predict(learn_1r, test[,-1])
cm_1r   <- confusionMatrix(pre_1r, test$diagnosis)
cm_1r
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign        96         8
##   Malignant      9        58
##                                          
##                Accuracy : 0.9006         
##                  95% CI : (0.8456, 0.941)
##     No Information Rate : 0.614          
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.7908         
##  Mcnemar's Test P-Value : 1              
##                                          
##             Sensitivity : 0.9143         
##             Specificity : 0.8788         
##          Pos Pred Value : 0.9231         
##          Neg Pred Value : 0.8657         
##              Prevalence : 0.6140         
##          Detection Rate : 0.5614         
##    Detection Prevalence : 0.6082         
##       Balanced Accuracy : 0.8965         
##                                          
##        'Positive' Class : Benign         
## 

JRip

# JRip (RWeka): RIPPER rule-based classifier; learns an ordered rule list.
learn_jrip <- JRip(diagnosis ~ ., data=train)
# Predict on the test predictors (column 1 is the diagnosis label).
pre_jrip <- predict(learn_jrip, test[,-1])
cm_jrip <- confusionMatrix(pre_jrip, test$diagnosis)        
cm_jrip
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign        96         3
##   Malignant      9        63
##                                           
##                Accuracy : 0.9298          
##                  95% CI : (0.8806, 0.9632)
##     No Information Rate : 0.614           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8544          
##  Mcnemar's Test P-Value : 0.1489          
##                                           
##             Sensitivity : 0.9143          
##             Specificity : 0.9545          
##          Pos Pred Value : 0.9697          
##          Neg Pred Value : 0.8750          
##              Prevalence : 0.6140          
##          Detection Rate : 0.5614          
##    Detection Prevalence : 0.5789          
##       Balanced Accuracy : 0.9344          
##                                           
##        'Positive' Class : Benign          
## 

Naive Bayes

# Naive Bayes (e1071): fit with the default-interface call — predictors
# (all columns but the first) and the diagnosis factor as the class label.
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.4
## 
## Attaching package: 'e1071'
## The following objects are masked from 'package:PerformanceAnalytics':
## 
##     kurtosis, skewness
learn_nb <- naiveBayes(train[,-1], train$diagnosis)
# Predict on the test predictors (column 1 is the diagnosis label).
pre_nb <- predict(learn_nb, test[,-1])
cm_nb <- confusionMatrix(pre_nb, test$diagnosis)        
cm_nb
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       100         6
##   Malignant      5        60
##                                           
##                Accuracy : 0.9357          
##                  95% CI : (0.8878, 0.9675)
##     No Information Rate : 0.614           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8639          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9524          
##             Specificity : 0.9091          
##          Pos Pred Value : 0.9434          
##          Neg Pred Value : 0.9231          
##              Prevalence : 0.6140          
##          Detection Rate : 0.5848          
##    Detection Prevalence : 0.6199          
##       Balanced Accuracy : 0.9307          
##                                           
##        'Positive' Class : Benign          
## 

Random Forest

# Random forest: 500 trees; keep the proximity matrix and variable-importance
# measures on the fitted object for later inspection.
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Use TRUE rather than the reassignable shorthand T.
learn_rf <- randomForest(diagnosis ~ ., data = train, ntree = 500,
                         proximity = TRUE, importance = TRUE)
# Predict on the test predictors (column 1 is the diagnosis label).
pre_rf <- predict(learn_rf, test[, -1])
cm_rf <- confusionMatrix(pre_rf, test$diagnosis)
cm_rf
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       100         4
##   Malignant      5        62
##                                           
##                Accuracy : 0.9474          
##                  95% CI : (0.9024, 0.9757)
##     No Information Rate : 0.614           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8893          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9524          
##             Specificity : 0.9394          
##          Pos Pred Value : 0.9615          
##          Neg Pred Value : 0.9254          
##              Prevalence : 0.6140          
##          Detection Rate : 0.5848          
##    Detection Prevalence : 0.6082          
##       Balanced Accuracy : 0.9459          
##                                           
##        'Positive' Class : Benign          
## 

Ctree

# Conditional inference tree (party): depth capped at 2 for a small,
# easily interpretable tree.
library(party)
## Warning: package 'party' was built under R version 3.4.4
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.4.4
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.4.4
learn_ct <- ctree(diagnosis~., data=train, controls=ctree_control(maxdepth=2))
# Predict on the test predictors (column 1 is the diagnosis label).
pre_ct   <- predict(learn_ct, test[,-1])
cm_ct    <- confusionMatrix(pre_ct, test$diagnosis)
cm_ct
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign        99         5
##   Malignant      6        61
##                                           
##                Accuracy : 0.9357          
##                  95% CI : (0.8878, 0.9675)
##     No Information Rate : 0.614           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8647          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9429          
##             Specificity : 0.9242          
##          Pos Pred Value : 0.9519          
##          Neg Pred Value : 0.9104          
##              Prevalence : 0.6140          
##          Detection Rate : 0.5789          
##    Detection Prevalence : 0.6082          
##       Balanced Accuracy : 0.9335          
##                                           
##        'Positive' Class : Benign          
## 

K-nn tune

# k-NN tuning: score test-set accuracy for k = 1..30, plot the curve, and
# refit with the first k that attains the maximum accuracy.
library(class)

# Preallocate rather than growing with c() inside the loop.
acc_test <- numeric(30)

# The original assigned the loop result to `predict`, which masks the S3
# generic predict(); renamed to pre_k. prob = TRUE replaces the shorthand T.
for (i in seq_len(30)) {
  pre_k <- knn(train = train[, -1], test = test[, -1], cl = train[, 1], k = i, prob = TRUE)
  acc_test[i] <- mean(pre_k == test[, 1])
}

acc <- data.frame(k = seq_len(30), cnt = acc_test)

# First k achieving the maximum accuracy (ties broken by smallest k).
opt_k <- subset(acc, cnt == max(cnt))[1, ]
sub <- paste("Optimal number of k is", opt_k$k, "(accuracy :", opt_k$cnt, ") in KNN")

library(highcharter)
## Warning: package 'highcharter' was built under R version 3.4.4
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
hchart(acc, 'line', hcaes(k, cnt)) %>%
  hc_title(text = "Accuracy With Varying K (KNN)") %>%
  hc_subtitle(text = sub) %>%
  hc_add_theme(hc_theme_google()) %>%
  hc_xAxis(title = list(text = "Number of Neighbors(k)")) %>%
  hc_yAxis(title = list(text = "Accuracy"))
# Final fit at the chosen k.
pre_knn <- knn(train = train[, -1], test = test[, -1], cl = train[, 1], k = opt_k$k, prob = TRUE)
cm_knn  <- confusionMatrix(pre_knn, test$diagnosis)
cm_knn
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign        99         5
##   Malignant      6        61
##                                           
##                Accuracy : 0.9357          
##                  95% CI : (0.8878, 0.9675)
##     No Information Rate : 0.614           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8647          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9429          
##             Specificity : 0.9242          
##          Pos Pred Value : 0.9519          
##          Neg Pred Value : 0.9104          
##              Prevalence : 0.6140          
##          Detection Rate : 0.5789          
##    Detection Prevalence : 0.6082          
##       Balanced Accuracy : 0.9335          
##                                           
##        'Positive' Class : Benign          
## 

GBM

# GBM: first fit a standalone gbm with 3-fold CV purely to choose the
# iteration count, then refit through caret at that fixed iteration count.
library(gbm)
## Warning: package 'gbm' was built under R version 3.4.4
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
# NOTE(review): distribution = "gaussian" with a factor outcome is unusual for
# a binary target — the caret refit below uses "bernoulli"; confirm this first
# fit's distribution is intended (it only drives the best.iter selection).
test_gbm <- gbm(diagnosis~., data=train, distribution="gaussian",n.trees = 10000,
                shrinkage = 0.01, interaction.depth = 4, bag.fraction=0.5, 
                train.fraction=0.5,n.minobsinnode=10,cv.folds=3,keep.data=TRUE,verbose=FALSE,n.cores=1)
# Iteration count with the best cross-validated performance.
best.iter <- gbm.perf(test_gbm, method="cv",plot.it=FALSE)
fitControl = trainControl(method="cv", number=5, returnResamp="all")
# Single-point tuneGrid: caret evaluates only the chosen hyperparameters.
learn_gbm = train(diagnosis~., data=train, method="gbm", distribution="bernoulli",
                  trControl=fitControl, verbose=F, tuneGrid=data.frame(.n.trees=best.iter, 
                                                                       .shrinkage=0.01, .interaction.depth=1,
                                                                       .n.minobsinnode=1))
# Predict on the test predictors (column 1 is the diagnosis label).
pre_gbm <- predict(learn_gbm, test[,-1])
cm_gbm <- confusionMatrix(pre_gbm, test$diagnosis)
cm_gbm
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign        99         3
##   Malignant      6        63
##                                           
##                Accuracy : 0.9474          
##                  95% CI : (0.9024, 0.9757)
##     No Information Rate : 0.614           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8899          
##  Mcnemar's Test P-Value : 0.505           
##                                           
##             Sensitivity : 0.9429          
##             Specificity : 0.9545          
##          Pos Pred Value : 0.9706          
##          Neg Pred Value : 0.9130          
##              Prevalence : 0.6140          
##          Detection Rate : 0.5789          
##    Detection Prevalence : 0.5965          
##       Balanced Accuracy : 0.9487          
##                                           
##        'Positive' Class : Benign          
## 

adaBoost

# AdaBoost ("gentle" variant, ada package) over rpart base learners.
library(rpart)
library(ada)
## Warning: package 'ada' was built under R version 3.4.4
# cp = -1 disables complexity-based pruning of the base trees;
# xval = 0 skips rpart's internal cross-validation.
control <- rpart.control(cp = -1, maxdepth = 14,maxcompete = 1,xval = 0)
# NOTE(review): test.x/test.y here are the TRAINING data, so ada's internal
# "test" error curve tracks training performance — confirm this is intended.
learn_ada <- ada(diagnosis~., data = train, test.x = train[,-1], test.y = train[,1], type = "gentle", control = control, iter = 70)
# True held-out evaluation happens here, on the separate test set.
pre_ada <- predict(learn_ada, test[,-1])
cm_ada <- confusionMatrix(pre_ada, test$diagnosis)
cm_ada
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       101         4
##   Malignant      4        62
##                                           
##                Accuracy : 0.9532          
##                  95% CI : (0.9099, 0.9796)
##     No Information Rate : 0.614           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9013          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9619          
##             Specificity : 0.9394          
##          Pos Pred Value : 0.9619          
##          Neg Pred Value : 0.9394          
##              Prevalence : 0.6140          
##          Detection Rate : 0.5906          
##    Detection Prevalence : 0.6140          
##       Balanced Accuracy : 0.9506          
##                                           
##        'Positive' Class : Benign          
## 

SVM

# SVM with e1071's default hyperparameters; a tuned (cost, gamma) grid-search
# version follows in the next section.
learn_svm <- svm(diagnosis~., data=train)
# Predict on the test predictors (column 1 is the diagnosis label).
pre_svm <- predict(learn_svm, test[,-1])
cm_svm <- confusionMatrix(pre_svm, test$diagnosis)
cm_svm
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       101         2
##   Malignant      4        64
##                                          
##                Accuracy : 0.9649         
##                  95% CI : (0.9252, 0.987)
##     No Information Rate : 0.614          
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.9264         
##  Mcnemar's Test P-Value : 0.6831         
##                                          
##             Sensitivity : 0.9619         
##             Specificity : 0.9697         
##          Pos Pred Value : 0.9806         
##          Neg Pred Value : 0.9412         
##              Prevalence : 0.6140         
##          Detection Rate : 0.5906         
##    Detection Prevalence : 0.6023         
##       Balanced Accuracy : 0.9658         
##                                          
##        'Positive' Class : Benign         
## 

SVM- tune

# SVM hyperparameter grid search over all (cost, gamma) pairs, scored by
# test-set accuracy; then refit at the best pair.
gamma <- seq(0, 0.1, 0.005)
cost <- 2^(0:5)
parms <- expand.grid(cost = cost, gamma = gamma)    ## 6 x 21 = 126 combinations

# Preallocate the accuracy vector rather than growing a NULL by indexed
# assignment; the unused leftover `acc_test <- numeric()` was removed.
accuracy2 <- numeric(nrow(parms))
accuracy1 <- NULL

for (i in seq_len(nrow(parms))) {
  learn_svm <- svm(diagnosis ~ ., data = train, gamma = parms$gamma[i], cost = parms$cost[i])
  pre_svm <- predict(learn_svm, test[, -1])
  accuracy1 <- confusionMatrix(pre_svm, test$diagnosis)
  accuracy2[i] <- accuracy1$overall[1]
}

acc <- data.frame(p = seq_len(nrow(parms)), cnt = accuracy2)

# First grid index achieving the maximum accuracy.
opt_p <- subset(acc, cnt == max(cnt))[1, ]
sub <- paste("Optimal number of parameter is", opt_p$p, "(accuracy :", opt_p$cnt, ") in SVM")

library(highcharter)
hchart(acc, 'line', hcaes(p, cnt)) %>%
  hc_title(text = "Accuracy With Varying Parameters (SVM)") %>%
  hc_subtitle(text = sub) %>%
  hc_add_theme(hc_theme_google()) %>%
  hc_xAxis(title = list(text = "Number of Parameters")) %>%
  hc_yAxis(title = list(text = "Accuracy"))
# Refit with the best (cost, gamma) pair and evaluate on the test set.
learn_imp_svm <- svm(diagnosis ~ ., data = train, cost = parms$cost[opt_p$p], gamma = parms$gamma[opt_p$p])
pre_imp_svm <- predict(learn_imp_svm, test[, -1])
cm_imp_svm <- confusionMatrix(pre_imp_svm, test$diagnosis)
cm_imp_svm
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       104         2
##   Malignant      1        64
##                                           
##                Accuracy : 0.9825          
##                  95% CI : (0.9496, 0.9964)
##     No Information Rate : 0.614           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9629          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9905          
##             Specificity : 0.9697          
##          Pos Pred Value : 0.9811          
##          Neg Pred Value : 0.9846          
##              Prevalence : 0.6140          
##          Detection Rate : 0.6082          
##    Detection Prevalence : 0.6199          
##       Balanced Accuracy : 0.9801          
##                                           
##        'Positive' Class : Benign          
## 

Visualize to compare the accuracy of all methods

# Side-by-side fourfold plots of every model's confusion matrix, each titled
# with its rounded overall accuracy — the twelve copy-pasted calls are
# replaced with a single data-driven loop (same panels, same order).
col <- c("#ed3b3b", "#0099ff")

# Named list fixes both the panel labels and the 3x4 layout order.
cms <- list(
  "RPart"        = cm_rp,
  "Prune"        = cm_pru,
  "OneR"         = cm_1r,
  "JRip"         = cm_jrip,
  "CTree"        = cm_ct,
  "NaiveBayes"   = cm_nb,
  "Tune KNN"     = cm_knn,
  "RandomForest" = cm_rf,
  "GBM"          = cm_gbm,
  "AdaBoost"     = cm_ada,
  "SVM"          = cm_svm,
  "Tune SVM"     = cm_imp_svm
)

par(mfrow = c(3, 4))
for (nm in names(cms)) {
  cm <- cms[[nm]]
  # Title format matches the original: "Name (NN%)".
  fourfoldplot(cm$table, color = col, conf.level = 0, margin = 1,
               main = paste(nm, " (", round(cm$overall[1] * 100), "%)", sep = ""))
}